/*******************************************
In this final program step, I create all the
files needed for downstream processing

I do this so that they all have the same source, 
and have the same restrictions.

I need to make the following files: 

- earnings_percentile
- earnings_regressions
- earnings including self-employment


*********************************************/

        
/*------------------------------------------------------------
   sample_selection

   Expands to a subsetting IF condition that decides which
   PIK records stay in the sample. The expansion carries NO
   terminating semicolon - the caller supplies it, so invoke
   it as a statement inside a DATA step that has the
   variables pik, year_grad and degcip.

   A record is kept only when all of these hold:
     - the first character of pik is 2
     - year_grad is later than 2000
     - the first two characters of degcip are none of
       03, 10, 30, 46, 60
------------------------------------------------------------*/
%macro sample_selection;

    if year_grad > 2000
        and substr(pik,1,1) = '2'
        and not (substr(degcip,1,2) in ('03','10','30','46','60'))

%mend sample_selection;

        
%macro sample_creation ;

/*==========================================================
   sample_creation

   Builds every downstream earnings file from one source
   (OUTPUTS.earnings_full) so they share the same sample
   restrictions.

   Steps, in order:
     1. grads_sample (quarterly rows) and grads_annual (one
        row per BY group and year with annual totals)
     2. seearn: self-employment earnings reshaped wide to long
     3. all_earnings: annual earnings + self-employment + flags
     4. all_earnings_demog: demographics merged on by pik
     5. all_earnings_public: restricted to public OPEIDs
     6. Stata exports and the SAS dataset for percentile graphs

   Names this macro expects to exist already:
     librefs     OUTPUTS, INPUTS, ICF
     macro vars  starty, endy, outpath, outstata
     macros      sample_selection, demog_fromschools, top_school
============================================================*/

/*==========================================================
        creating two datasets, one for regressions (grads_sample) 
        and one for the annual graphs (grads_annual), 
        which will get se_earnings merged in 
============================================================*/
data grads_sample 
     grads_annual  (drop=quarter qtime national_earnings instate_earnings 
                        self_emp: ii) ; 
    set OUTPUTS.earnings_full ; 
    by pik opeid deglevl_code degcip year quarter ; 
    /* sample restriction: subsetting IF generated by the macro,
       the semicolon after the call terminates that statement */
    %sample_selection;
    /* running annual totals and counts of quarters with positive
       earnings, carried across the quarterly rows of one year */
    retain national_annual instate_annual nat_qtrsemp state_qtrsemp;
    
    /* reset the accumulators at the first row of each year */
    if first.year then do ; 
        national_annual = 0 ; 
        instate_annual = 0 ;
        nat_qtrsemp =0 ;
        state_qtrsemp= 0 ;
     end;
     /* missing quarterly earnings are treated as zero */
     if national_earnings = . then national_earnings = 0  ;
     if instate_earnings = . then instate_earnings = 0 ; 
     national_annual = sum(national_annual,national_earnings); 
     instate_annual = sum(instate_annual,instate_earnings) ;

     /* the comparison evaluates to 1 or 0, so this counts quarters */
     nat_qtrsemp = nat_qtrsemp + (national_earnings>0) ;
     state_qtrsemp = state_qtrsemp + (instate_earnings>0) ;
        
     /*************************
     creating year_postgrad and keeping only
     years after the graduation year
     *************************/
     year_postgrad = year - year_grad ;
     if year_postgrad > 0 ;  
     
     /* the last row of the year holds the complete annual totals */
     if last.year then do ;
        output grads_annual ; 
     end;
     output grads_sample ;

run;

/*==========================================================
        self-employment earnings, reshaped from one row per
        input record (yearly columns) to one row per year
============================================================*/
data seearn (drop=se_earn&starty.-se_earn&endy.)  ;
    set INPUTS.seearnings_wide ;
    by pik ; 
    
    array se_earn{&starty.:&endy.} se_earn&starty.-se_earn&endy.; 
    array s_emp{&starty.:&endy.} self_emp&starty.-self_emp&endy.;
        
    /* NOTE(review): the loop bounds are hard-coded to 2002-2016 while
       the arrays are bounded by the starty and endy macro variables.
       If starty is above 2002 or endy is below 2016 the subscript
       goes out of range - confirm the two always agree. */
    do ii = 2002 to 2016 ; 
        flag_se = s_emp{ii} ;
        se_earnings = se_earn{ii} ;
        year = ii ;
        output ;
    end;
run;
        
proc sort data=grads_annual ;
        by pik year ; 
run;
        
/*==========================================================
        annual earnings with self-employment merged in by
        pik and year, the annual totals are renamed on output
============================================================*/
data all_earnings (rename=(national_annual = national_earnings 
                            instate_annual=instate_earnings));
     merge grads_annual (in=a) seearn (in=b) ;
     by pik year ;
        
     /* rows without a self-employment match get zeros */
     if missing(se_earnings) then se_earnings = 0 ;
     if missing(flag_se) then flag_se = 0 ; 
     if year>year_grad ; 
     
     /* self-employment flag set and no national earnings that year */
     flag_onlyse = 0 ; 
     if flag_se = 1 and national_annual =0 then flag_onlyse=1;
     /* keep only rows that exist in grads_annual */
     if a; 
        
     total_earnings = national_annual + se_earnings ;
        
     /***********************************
        Also applying the flags here that 
        we need for sample flags in percentile
        creation
     *************************************/
     sample_all=1 ;
     sample_FTE = (national_annual> 10000 and nat_qtrsemp >=3) ;
     only_instate = (national_annual=instate_annual) ;
     any_instate = (instate_annual>0) ;
     cip_2dig = substr(degcip,1,2) ;
run;

/************************************
Merging in demographics here to avoid
missing self-employment only people
************************************/

/* view over ICF.icf_us that adds 0/1 demographic indicators */
data icf (keep=pik dob male race white black asian hispanic
              pob us_native ethnicity)/view=icf ;
    set ICF.icf_us ;
    male = (sex = 'M') ;
    white = race='1' ;
    black = race = '2' ;
    asian = race = '3' ;
    us_native = (pob = 'A');
    hispanic = (ethnicity='H') ;
run;

/* NOTE(review): presumably this macro creates demogs_fromstates,
   which is merged below - it is defined outside this file section */
%demog_fromschools;

proc sort data=all_earnings;
     by pik ;
run;

/* NOTE(review): the merge needs icf and demogs_fromstates to be
   in pik order already, nothing here sorts them - confirm */
data all_earnings_demog ;
     merge all_earnings (in=a) icf (in=b) demogs_fromstates (in=c) ;
     by pik;

     if a;
run;


/**********************************
Now only keeping the OPEIDs that are public
institutions
**************************************/
proc sort data=all_earnings_demog ;
     by opeid ;
run;

/* the option is spelled dbms - the previous spelling dmbs is not a
   PROC IMPORT option and stopped the step with a syntax error */
proc import datafile="&outpath./opeid_public.dta" out=work.opeid_public dbms=dta replace;
run;

proc sort data=opeid_public ;
     by opeid ;
run;

data all_earnings_public ;
     merge all_earnings_demog (in=a) opeid_public (keep=opeid public) ;
     by opeid ;

     if a;
     if public = 1 or ui_state = '42'; /* PSU gets classified as not a public school,
                       which is problematic since thats the whole state of PA for us.*/
run;

proc sort data=all_earnings_public ;
     by pik year ;
run;
        
/***************************        
Now, we need to export two datasets to STATA - 
        
   all_earnings_long    (annual,    from all_earnings_public)
   earnings_regressions (quarterly, from grads_sample)
        
**********************************/
        /* annual */
proc export data=all_earnings_public
        outfile="&outstata./all_earnings_long.dta" dbms=dta replace; 
run;
        /* quarterly */
proc export data=grads_sample 
        outfile="&outstata./earnings_regressions.dta" dbms=dta replace;  
run;
        
/************************************
        Also outputting SAS dataset for
        percentile graphs
*************************************/
        
/* NOTE(review): this reads all_earnings, so it carries neither the
   merged demographics nor the public-institution restriction -
   confirm that is intended */
data OUTPUTS.individual_earnings_merge ; 
        set all_earnings ; 
        %top_school;
run;
   
%mend sample_creation ; 

/* Run the pipeline defined in the sample_creation macro above. */
%sample_creation;
